Objectif : Analyse économétrique : exploration des données, modélisations et prévisions
Date début de l’analyse : 17 mai 2022
Date de la dernière modification : 13 juillet 2022
# Packages
packages = c("tidyverse", "glue", "kableExtra", "knitr", "plotly", "hrbrthemes", "gridExtra", "stats", "arrow", "lme4", "tidymodels", "broom.mixed", "multilevelmod", "dotwhisker", "rAmCharts", "DescTools", "readxl")
## Installation des packages si besoin et chargement des librairies
package.check <- lapply(packages,
FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)}})
# Import BSO complet (2013-2020)
data <- read_csv("../data/4.process/managing-variables_2nd-part/whole_bso.csv", col_types = cols(author_position = "c"), na = c("", "NA"))
# Création de nouvelles variables : couleur de journal finale et APC fiables
data <- data %>% mutate(oa_color_finale = case_when(oa_color_article_BSO == "diamond" ~ "diamond",
oa_color_openalex == "gold" ~ "gold",
oa_color_openalex == "hybrid" ~ "hybridOA",
TRUE ~ "other"),
amount_apc_EUR_trusted = case_when(apc_source == "openAPC_estimation_publisher" | apc_source == "openAPC_estimation_publisher_year" ~ NA_real_,
TRUE ~ amount_apc_EUR))
Pour les modélisations, nous cherchons à expliquer et prédire le montant des APC (Y) avec 2 variables potentielles :
L’analyse exploratoire nous dira quelle variable des APC est à retenir pour inclure dans les modèles. Dans tous les cas, les variables explicatives utilisées pour prédire Y seront “oa_color_finale”, “tier”, “bso_classification”, “is_french_CA_bso_wos_openalex_single_lang”, “year” et “is_covered_by_couperin”.
Pour préparer les données nous créons donc les variables
“oa_color_finale” et “amount_apc_EUR_trusted”, puis
nous sélectionnons les 6 variables qui entrent dans les modèles en
supprimant les valeurs manquantes, et enfin nous filtrons sur les
articles pour lesquels des APC ont été payés
(apc_has_been_paid == 1, amount_apc_EUR_trusted > 0).
# Sélection de variables pour les modèles
bso_mvp <- data %>%
select(year, bso_classification, apc_has_been_paid, tier, oa_color_finale, is_french_CA_bso_wos_openalex_single_lang, amount_apc_EUR, amount_apc_EUR_trusted, is_covered_by_couperin) %>%
mutate(bso_classification = as.factor(bso_classification),
apc_has_been_paid = as.factor(apc_has_been_paid),
tier = as.factor(tier),
oa_color_finale = as.factor(oa_color_finale),
# On modifie les valeurs des catégories pour éviter les conflits de nom de classes dans les modélisations
is_french_CA_bso_wos_openalex_single_lang = as.factor(case_when(is_french_CA_bso_wos_openalex_single_lang == 1 ~ "french CA",
is_french_CA_bso_wos_openalex_single_lang == 0 ~ "non french CA")),
is_covered_by_couperin = as.factor(case_when(is_covered_by_couperin == 1 ~ "covered",
is_covered_by_couperin == 0 ~ "non covered")))
# Préparation du jeu de données prêt à modéliser
bso_pop <- bso_mvp %>%
filter(!is.na(bso_classification),
!is.na(oa_color_finale),
!is.na(is_french_CA_bso_wos_openalex_single_lang),
!is.na(amount_apc_EUR),
!is.na(is_covered_by_couperin)) %>%
filter(apc_has_been_paid == 1, amount_apc_EUR > 0)
# Fonction pour sauvegarder les viz au format SVG
saving_plot <- function(name, graph) {
ggsave(file = glue("../figures/SVG_modelisations/{name}.svg"), plot=graph, width=10, height=6)
}
table_y1_class <- bso_pop %>% group_by(year, bso_classification) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup()
# Plot
graph <- ggplot(table_y1_class, aes(x = year, y = montant_moyen, group = bso_classification, colour = bso_classification)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
labs(y = "Average APC", title = "Montants moyens des APC payés par discipline et par année", colour = "") +
scale_color_manual(values = c("#025bbb", "#9d6939", "#3aaf29", "#f1e401", "#fcb2b8", "#fb6d35", "#284803", "#852895", "#919598", "#24c0e8"),
breaks = c("Biology (fond.)", "Medical research", "Chemistry", "Earth, Ecology, \nEnergy and applied biology", "Social sciences", "Computer and \n information sciences", "Physical sciences, Astronomy", "Engineering", "Mathematics", "Humanities")) +
theme_classic() +
theme(legend.position = "right",
axis.title.x = element_blank())
graph
saving_plot("1.explo_APC-paid_discipline", graph)

table_y1_tier <- bso_pop %>% group_by(year, tier) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup() %>%
mutate(tier = as.character(paste("tier", tier, sep = " ")))
# Plot
graph <- ggplot(table_y1_tier, aes(x = year, y = montant_moyen, group = tier, colour = tier)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(y = "Average APC", title = "Montants moyens des APC payés par tier et par année", color = "") +
theme_classic() +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099", "#0072B2", "#D55E00")) +
theme(legend.position = "right",
axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y1_tier %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("2.explo_APC-paid_tier", graph)

table_y1_ca <- bso_pop %>%
group_by(year, is_french_CA_bso_wos_openalex_single_lang) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup() %>%
mutate(is_french_CA = as.character(is_french_CA_bso_wos_openalex_single_lang))
# Plot
graph <- ggplot(table_y1_ca, aes(x = year, y = montant_moyen, group = is_french_CA, colour = is_french_CA)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par année, selon si le CA est français") +
scale_color_manual(values = c("#009E73", "#660099"),
breaks = c("non french CA", "french CA")) +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y1_ca %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("3.explo_APC-paid_frenchCA", graph)

table_y1_color <- bso_pop %>% group_by(year, oa_color_finale) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup() %>% mutate(oa_color_finale = as.character(oa_color_finale))
# Plot
graph <- ggplot(table_y1_color, aes(x = year, y = montant_moyen, group = oa_color_finale, colour = oa_color_finale)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par couleur OA et par année") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#0072B2", "#FFCC00"),
breaks = c("hybridOA", "gold")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y1_color %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("4.explo_APC-paid_OAcolor", graph)

table_y1_coup <- bso_pop %>% group_by(year, is_covered_by_couperin) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup() %>%
mutate(is_covered_by_couperin = case_when(is_covered_by_couperin == "covered" ~ "covered by couperin",
is_covered_by_couperin == "non covered" ~ "not covered by couperin"))
# Plot
graph <- ggplot(table_y1_coup, aes(x = year, y = montant_moyen, group = is_covered_by_couperin, colour = is_covered_by_couperin)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par année, selon si l'article est couvert par les accords Couperin") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099"),
breaks = c("covered by couperin", "not covered by couperin")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y1_coup %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("5.explo_APC-paid_covered", graph)

table_y2_class <- bso_pop %>% group_by(year, bso_classification) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup()
# Plot
graph <- ggplot(table_y2_class, aes(x = year, y = montant_moyen, group = bso_classification, colour = bso_classification)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
labs(y = "Average APC", title = "Montants moyens des APC payés par discipline et par année", colour = "") +
scale_color_manual(values = c("#025bbb", "#9d6939", "#3aaf29", "#f1e401", "#284803", "#fcb2b8", "#fb6d35", "#852895", "#24c0e8", "#919598"),
breaks = c("Biology (fond.)", "Medical research", "Chemistry", "Earth, Ecology, \nEnergy and applied biology", "Physical sciences, Astronomy", "Social sciences", "Computer and \n information sciences", "Engineering", "Humanities", "Mathematics")) +
theme_classic() +
theme(legend.position = "right",
axis.title.x = element_blank())
graph
saving_plot("6.explo_APC-paid_discipline_trusted", graph)

table_y2_tier <- bso_pop %>% group_by(year, tier) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup() %>%
mutate(tier = as.character(paste("tier", tier, sep = " ")))
# Plot
graph <- ggplot(table_y2_tier, aes(x = year, y = montant_moyen, group = tier, colour = tier)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(y = "Average APC", title = "Montants moyens des APC payés par tier et par année", color = "") +
theme_classic() +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099", "#0072B2", "#D55E00")) +
theme(legend.position = "right",
axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y2_tier %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("7.explo_APC-paid_tier_trusted", graph)

table_y2_ca <- bso_pop %>%
group_by(year, is_french_CA_bso_wos_openalex_single_lang) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup() %>%
mutate(is_french_CA = as.character(is_french_CA_bso_wos_openalex_single_lang))
# Plot
graph <- ggplot(table_y2_ca, aes(x = year, y = montant_moyen, group = is_french_CA, colour = is_french_CA)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par année, selon si le CA est français") +
scale_color_manual(values = c("#009E73", "#660099"),
breaks = c("non french CA", "french CA")) +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y2_ca %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("8.explo_APC-paid_frenchCA_trusted", graph)

table_y2_color <- bso_pop %>% group_by(year, oa_color_finale) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup() %>% mutate(oa_color_finale = as.character(oa_color_finale))
# Plot
graph <- ggplot(table_y2_color, aes(x = year, y = montant_moyen, group = oa_color_finale, colour = oa_color_finale)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par couleur OA et par année") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#0072B2", "#FFCC00"),
breaks = c("hybridOA", "gold")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y2_color %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("9.explo_APC-paid_OAcolor_trusted", graph)

table_y2_coup <- bso_pop %>% group_by(year, is_covered_by_couperin) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup() %>%
mutate(is_covered_by_couperin = case_when(is_covered_by_couperin == "covered" ~ "covered by couperin",
is_covered_by_couperin == "non covered" ~ "not covered by couperin"))
# Plot
graph <- ggplot(table_y2_coup, aes(x = year, y = montant_moyen, group = is_covered_by_couperin, colour = is_covered_by_couperin)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par année, selon si l'article est couvert par les accords Couperin") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099"),
breaks = c("covered by couperin", "not covered by couperin")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table_y2_coup %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("10.explo_APC-paid_covered_trusted", graph)

Au vu des graphiques présentés ci-dessus, nous décidons de ne modéliser que les articles pour lesquels le montant des APC est fiable. Nous supprimons donc la variable des APC du BSO puis filtrons sur les articles ayant un APC fiable connu.
n_old <- nrow(bso_pop)
bso_pop <- bso_pop %>% select(-amount_apc_EUR) %>% filter(!is.na(amount_apc_EUR_trusted))
Le nombre d’articles dans notre base finale pour les modélisations est ainsi de 119616. En ne gardant que les sources d’APC fiables, 28554 articles ont été écartés, soit 19.27% des articles avec des APC payés et des valeurs connues pour les autres variables.
amBoxplot(bso_pop$amount_apc_EUR_trusted, xlab=" ", ylab="Montant des APC fiables", main="Boxplot de Y")
L’étendue (écart entre les valeurs minimales et maximales) du montant des APC est importante, avec des valeurs allant de 2.27 à 9848.4 euros. Nous constatons 2 valeurs extrêmes qui se détachent largement des autres sur le boxplot, où les montants sont respectivement de 9848.4 et 9028.43 euros. Après vérification à partir de l’ISSN, les valeurs sont plausibles donc nous décidons de laisser ces valeurs aux données à modéliser.
# Normalité de Y
mean <- mean(bso_pop$amount_apc_EUR_trusted, na.rm = TRUE)
sd <- sd(bso_pop$amount_apc_EUR_trusted, na.rm = TRUE)
ggplot(bso_pop, aes(bso_pop$amount_apc_EUR_trusted)) +
geom_histogram(aes(y=..density..), color="black", fill = "steelblue", alpha = 0.2) +
geom_density( color="red", size = 1, adjust = 5) +
stat_function(fun = dnorm, colour = "Black", size = 1, args = list(mean = mean, sd = sd)) +
xlim(c(min(bso_pop$amount_apc_EUR_trusted)-1, max(bso_pop$amount_apc_EUR_trusted)+1))+ ggtitle("Distribution du montant des APC fiables") +
xlab("Montant des APC fiables") + ylab("Densité") +
theme_classic()
# Test statistique de normalité
out <- JarqueBeraTest(bso_pop$amount_apc_EUR_trusted, robust = FALSE, method = "chisq") # Y non normal (on rejette H0)
library(formattable)
df <- data.frame(value = unlist(out))
tdf <- as.data.frame(t(df))
formattable(tdf)
| statistic.X-squared | parameter.df | p.value | method | data.name | |
|---|---|---|---|---|---|
| value | 49797.6693637252 | 2 | 0 | Jarque Bera Test | bso_pop$amount_apc_EUR_trusted |

Le montant des APC fiables n’est pas normalement distribué.
# Fonction pour sauvegarder les résultats des tests
save_results <- function(output, num){
df <- data.frame(value = unlist(output))
tdf <- as.data.frame(t(df))
assign(glue("tdf_{num}"), tdf[,1:5], envir = .GlobalEnv)
}
# Test de corrélation (Spearman car distribution non normale)
output <- cor.test(bso_pop$amount_apc_EUR_trusted, bso_pop$year, method = "spearman") # corrélation significative
df <- data.frame(value = unlist(output))
tdf <- as.data.frame(t(df))
tdf <- tdf %>% mutate(data.name = str_replace_all(data.name, "_", "\\_"), #escape special characters
data.name = gsub("bso_pop\\$", "", data.name))
formattable(tdf)
| statistic.S | p.value | estimate.rho | null.value.rho | alternative | method | data.name | |
|---|---|---|---|---|---|---|---|
| value | 242633003532012 | 0 | 0.149384487346854 | 0 | two.sided | Spearman’s rank correlation rho | amount_apc_EUR_trusted and year |
# Tests de dépendance
# Année
save_results(chisq.test(bso_pop$year, bso_pop$bso_classification), "1")
save_results(chisq.test(bso_pop$year, bso_pop$tier), "2")
save_results(chisq.test(bso_pop$year, bso_pop$oa_color_finale), "3")
save_results(chisq.test(bso_pop$year, bso_pop$is_french_CA_bso_wos_openalex_single_lang), "4")
save_results(chisq.test(bso_pop$year, bso_pop$is_covered_by_couperin), "5")
# Discipline
save_results(chisq.test(bso_pop$bso_classification, bso_pop$tier), "6")
save_results(chisq.test(bso_pop$bso_classification, bso_pop$oa_color_finale), "7")
save_results(chisq.test(bso_pop$bso_classification, bso_pop$is_french_CA_bso_wos_openalex_single_lang), "8")
save_results(chisq.test(bso_pop$bso_classification, bso_pop$is_covered_by_couperin), "9")
# Tier
save_results(chisq.test(bso_pop$tier, bso_pop$oa_color_finale), "10")
save_results(chisq.test(bso_pop$tier, bso_pop$is_french_CA_bso_wos_openalex_single_lang), "11")
save_results(chisq.test(bso_pop$tier, bso_pop$is_covered_by_couperin), "12")
# Couleur OA
save_results(chisq.test(bso_pop$oa_color_finale, bso_pop$is_french_CA_bso_wos_openalex_single_lang), "13")
save_results(chisq.test(bso_pop$oa_color_finale, bso_pop$is_covered_by_couperin), "14")
# French CA
save_results(chisq.test(bso_pop$is_french_CA_bso_wos_openalex_single_lang, bso_pop$is_covered_by_couperin), "15") #toutes dépendantes
# Présentation des résultats dans une table
output <- rbind(tdf_1,tdf_2,tdf_3,tdf_4,tdf_5,tdf_6,tdf_7,tdf_8,tdf_9,tdf_10,tdf_11,tdf_12,tdf_13,tdf_14,tdf_15) %>%
mutate(p.value = as.numeric(p.value),
Dependance = case_when(p.value <= 0.05 ~ "Oui",
TRUE ~ "Non"))
# Escape special characters
output <- output %>% mutate(data.name = str_replace_all(data.name, "_", "\\_"), #escape special characters
data.name = gsub("bso_pop\\$", "", data.name)) #remove base name
rownames(output) <- NULL #remove rownames
# Display table
kable(output, format = "html", caption = "Résultats des tests de dépendance entre les variables qualitatives") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F,
html_font = "sans-serif")
| statistic.X-squared | parameter.df | p.value | method | data.name | Dependance |
|---|---|---|---|---|---|
| 1844.04364295786 | 63 | 0 | Pearson’s Chi-squared test | year and bso_classification | Oui |
| 10564.3561750721 | 21 | 0 | Pearson’s Chi-squared test | year and tier | Oui |
| 89.5375126042915 | 7 | 0 | Pearson’s Chi-squared test | year and oa_color_finale | Oui |
| 207.441377682062 | 7 | 0 | Pearson’s Chi-squared test | year and is_french_CA_bso_wos_openalex_single_lang | Oui |
| 155.842642000071 | 7 | 0 | Pearson’s Chi-squared test | year and is_covered_by_couperin | Oui |
| 7903.46367919093 | 27 | 0 | Pearson’s Chi-squared test | bso_classification and tier | Oui |
| 4498.17074461241 | 9 | 0 | Pearson’s Chi-squared test | bso_classification and oa_color_finale | Oui |
| 223.795592556606 | 9 | 0 | Pearson’s Chi-squared test | bso_classification and is_french_CA_bso_wos_openalex_single_lang | Oui |
| 13109.2394339774 | 9 | 0 | Pearson’s Chi-squared test | bso_classification and is_covered_by_couperin | Oui |
| 11883.1772351368 | 3 | 0 | Pearson’s Chi-squared test | tier and oa_color_finale | Oui |
| 228.437752033453 | 3 | 0 | Pearson’s Chi-squared test | tier and is_french_CA_bso_wos_openalex_single_lang | Oui |
| 15863.9058727782 | 3 | 0 | Pearson’s Chi-squared test | tier and is_covered_by_couperin | Oui |
| 1870.71182474937 | 1 | 0 | Pearson’s Chi-squared test with Yates’ continuity correction | oa_color_finale and is_french_CA_bso_wos_openalex_single_lang | Oui |
| 49839.9633692406 | 1 | 0 | Pearson’s Chi-squared test with Yates’ continuity correction | oa_color_finale and is_covered_by_couperin | Oui |
| 1145.55413773546 | 1 | 0 | Pearson’s Chi-squared test with Yates’ continuity correction | is_french_CA_bso_wos_openalex_single_lang and is_covered_by_couperin | Oui |
# Vérifier :
#- homoscédasticité (homogénéité de la variance au sein des sous-pops)
#- indépendance (sous-pops doivent avoir un effet aléatoire sur Y)
#- autocorrélation (résidus et erreurs ne doivent pas être auto-correlés)
# Transformation de la variable année
bso_pop <- bso_pop %>%
mutate(year = year - 2013)
# Modèle linéaire
lm1 <- lm(amount_apc_EUR_trusted ~ year, data = bso_pop)
# Summary du modèle
summary <- tidy(lm1)
kable(summary, format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F,
html_font = "sans-serif")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1633.06034 | 5.168348 | 315.97340 | 0 |
| year | 39.79084 | 1.063911 | 37.40053 | 0 |
# Y estimé
prediction_lm1 <- round(predict(lm1),2)
# Mesures d'erreurs
mae_lm1 <- mean(abs(prediction_lm1 - bso_pop$amount_apc_EUR_trusted))
mdae_lm1 <- median(abs(prediction_lm1 - bso_pop$amount_apc_EUR_trusted))
sae_lm1 <- sum(abs(prediction_lm1 - bso_pop$amount_apc_EUR_trusted))
Chaque année, le montant des APC (fiables) augmente de 39.79 euros en moyenne.
# Modèle linéaire
lm2 <- lm(amount_apc_EUR_trusted ~ year + tier + oa_color_finale + is_covered_by_couperin + is_french_CA_bso_wos_openalex_single_lang + bso_classification, data = bso_pop)
# Summary du modèle
summary <- tidy(lm2)
kable(summary, format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F,
html_font = "sans-serif")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1949.00037 | 11.2763319 | 172.839925 | 0.0e+00 |
| year | 36.92390 | 0.9814459 | 37.621943 | 0.0e+00 |
| tier2 | -271.02121 | 8.5219874 | -31.802583 | 0.0e+00 |
| tier3 | -401.64620 | 8.4729780 | -47.403191 | 0.0e+00 |
| tier4 | -349.58549 | 9.0370551 | -38.683563 | 0.0e+00 |
| oa_color_finalehybridOA | 763.45374 | 7.1434349 | 106.874879 | 0.0e+00 |
| is_covered_by_couperinnon covered | -72.01286 | 8.4917775 | -8.480304 | 0.0e+00 |
| is_french_CA_bso_wos_openalex_single_langnon french CA | 71.06395 | 4.2454420 | 16.738881 | 0.0e+00 |
| bso_classificationChemistry | -404.66028 | 12.7847102 | -31.651893 | 0.0e+00 |
| bso_classificationComputer and information sciences | -478.30548 | 16.1292903 | -29.654466 | 0.0e+00 |
| bso_classificationEarth, Ecology, Energy and applied biology | -287.59635 | 7.6822988 | -37.436236 | 0.0e+00 |
| bso_classificationEngineering | -587.72553 | 21.7926464 | -26.968984 | 0.0e+00 |
| bso_classificationHumanities | -881.87264 | 27.4958951 | -32.072884 | 0.0e+00 |
| bso_classificationMathematics | -1010.07039 | 28.5262179 | -35.408493 | 0.0e+00 |
| bso_classificationMedical research | -23.60761 | 4.8664055 | -4.851138 | 1.2e-06 |
| bso_classificationPhysical sciences, Astronomy | -404.98640 | 8.6786828 | -46.664500 | 0.0e+00 |
| bso_classificationSocial sciences | -470.37703 | 27.9288201 | -16.841994 | 0.0e+00 |
# Analyse des résidus
lm2_residu <- round(residuals(lm2),2)
# Y estimé
prediction_lm2 <- round(predict(lm2),2)
# Mesures d'erreurs
mae_lm2 <- mean(abs(prediction_lm2 - bso_pop$amount_apc_EUR_trusted))
mdae_lm2 <- median(abs(prediction_lm2 - bso_pop$amount_apc_EUR_trusted))
sae_lm2 <- sum(abs(prediction_lm2 - bso_pop$amount_apc_EUR_trusted))
Chaque discipline dispose de son ordonnée à l’origine, et l’année impacte l’angle de la pente de régression.
## Modèle multi-niveaux
mm1 <- lmer(amount_apc_EUR_trusted ~ year + (1 | bso_classification), data = bso_pop)
# Summary du modèle
print_summary <- function(model, name){
summary <- tidy(model)
print(kable(summary, format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F,
html_font = "sans-serif"))
# Y estimé
prediction <- predict(model)
# Mesures d'erreurs
mae <- mean(abs(prediction - bso_pop$amount_apc_EUR_trusted))
mdae <- median(abs(prediction - bso_pop$amount_apc_EUR_trusted))
sae <- sum(abs(prediction - bso_pop$amount_apc_EUR_trusted))
# Sauvegarde dans l'environnement
assign(glue("summary_{name}"), summary, envir = .GlobalEnv)
assign(glue("prediction_{name}"), prediction, envir = .GlobalEnv)
assign(glue("mae_{name}"), mae, envir = .GlobalEnv)
assign(glue("mdae_{name}"), mdae, envir = .GlobalEnv)
assign(glue("sae_{name}"), sae, envir = .GlobalEnv)
}
print_summary(mm1, "mm1")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1305.66655 | 104.566292 | 12.48650 |
| fixed | NA | year | 42.21473 | 1.051654 | 40.14127 |
| ran_pars | bso_classification | sd__(Intercept) | 329.72755 | NA | NA |
| ran_pars | Residual | sd__Observation | 805.72453 | NA | NA |
# Sauvegarde les paramètres du modèle
intercept <- summary_mm1 %>% filter(term == "(Intercept)") %>% select(estimate)
# Visualisation des résultats
tidy(mm1, effects = "ran_vals") %>% # effets aléatoires
mutate(estimate = intercept$estimate + estimate, # ajout de l'ordonnée à l'origine
ymin = estimate - 2 * `std.error`, # intervalles à 95%
ymax = estimate + 2 * `std.error`) %>%
arrange(estimate) %>%
mutate(level = as_factor(level)) %>%
ggplot(aes(x = level, y = estimate)) +
ylab("Intercepts estimated") +
geom_pointrange(aes(ymin = ymin, ymax = ymax)) +
geom_hline(yintercept = intercept$estimate, linetype = 2) +
coord_flip()

Chaque discipline dispose de son ordonnée à l’origine, et l’année impacte l’angle de la pente de régression par discipline.
## Modèle multi-niveaux
mm2 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | bso_classification), data = bso_pop)
# Summary du modèle
print_summary(mm2, "mm2")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1430.33224 | 101.49955 | 14.092006 |
| fixed | NA | year | 15.13775 | 10.39035 | 1.456905 |
| ran_pars | bso_classification | sd__(Intercept) | 318.00774 | NA | NA |
| ran_pars | bso_classification | cor__(Intercept).year | -0.06309 | NA | NA |
| ran_pars | bso_classification | sd__year | 31.66700 | NA | NA |
| ran_pars | Residual | sd__Observation | 803.31022 | NA | NA |
library(grid)
library(gtable)
viz_intercept_trend <- function(model, name, summary){
# Sauvegarde les paramètres du modèle
intercept <- summary %>% filter(term == "(Intercept)") %>% select(estimate)
coef_trend <- summary %>% filter(term == "year") %>% select(estimate)
# modèle entier
tidy_model_intercept <- tidy(model, effects = "ran_vals") %>% arrange(estimate) %>% filter(term %in% "(Intercept)") %>% mutate(level = as_factor(level))
# Visualisation des résultats
## Ordonnées à l'origine
graph_intercept <- tidy_model_intercept %>%
mutate(estimate = intercept$estimate + estimate, # ajout de l'ordonnée à l'origine
ymin = estimate - 2 * `std.error`, # intervalles à 95%
ymax = estimate + 2 * `std.error`) %>%
ggplot(aes(x = level, y = estimate)) +
ylab("Intercepts estimated") +
geom_pointrange(aes(ymin = ymin, ymax = ymax)) +
geom_hline(yintercept = intercept$estimate, linetype = 2) +
coord_flip()
## On récupère l'ordre des niveaux pour l'injecter au deuxième graph
order_level <- tidy_model_intercept %>% select(level) %>% t()
## Coefficients des pentes
graph_trend <- tidy(model, effects = "ran_vals") %>% # effets aléatoires
filter(term %in% "year") %>% # pentes
mutate(level = factor(level, order = TRUE, levels = order_level)) %>%
mutate(estimate = coef_trend$estimate + estimate, # ajout de l'ordonnée à l'origine
ymin = estimate - 2 * `std.error`, # intervalles à 95%
ymax = estimate + 2 * `std.error`) %>%
ggplot(aes(x = level, y = estimate)) +
ylab("Slope coefficients estimated") +
geom_pointrange(aes(ymin = ymin, ymax = ymax)) +
geom_hline(yintercept = coef_trend$estimate, linetype = 2) +
coord_flip() +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank())
grid.newpage()
grid.draw(cbind(ggplotGrob(graph_intercept), ggplotGrob(graph_trend), size="last"))
}
viz_intercept_trend(mm2, "mm2", summary_mm2)

Chaque tier dispose de son ordonnée à l’origine, et l’année impacte l’angle de la pente de régression par tier.
## Modèle multi-niveaux
mm3 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | tier), data = bso_pop)
# Summary du modèle
print_summary(mm3, "mm3")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1821.7423612 | 324.40976 | 5.6155597 |
| fixed | NA | year | 23.2517337 | 25.46493 | 0.9130884 |
| ran_pars | tier | sd__(Intercept) | 648.7117385 | NA | NA |
| ran_pars | tier | cor__(Intercept).year | -0.9085942 | NA | NA |
| ran_pars | tier | sd__year | 50.8713498 | NA | NA |
| ran_pars | Residual | sd__Observation | 792.1446766 | NA | NA |
viz_intercept_trend(mm3, "mm3", summary_mm3)

Chaque groupe d’articles selon qu’ils soient couvert par les accords Couperin dispose de son ordonnée à l’origine, et de son coefficient de pente de régression impacté par l’année.
## Modèle multi-niveaux
mm4 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | is_covered_by_couperin), data = bso_pop)
# Summary du modèle
print_summary(mm4, "mm4")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1887.64180 | 344.18858 | 5.484324 |
| fixed | NA | year | 30.58142 | 10.40395 | 2.939406 |
| ran_pars | is_covered_by_couperin | sd__(Intercept) | 486.68880 | NA | NA |
| ran_pars | is_covered_by_couperin | cor__(Intercept).year | -1.00000 | NA | NA |
| ran_pars | is_covered_by_couperin | sd__year | 14.64059 | NA | NA |
| ran_pars | Residual | sd__Observation | 792.57366 | NA | NA |
viz_intercept_trend(mm4, "mm4", summary_mm4)

Chaque couleur OA dispose de son ordonnée à l’origine, et l’année impacte l’angle de la pente de régression par couleur OA.
## Modèle multi-niveaux
mm5 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | oa_color_finale), data = bso_pop)
# Summary du modèle
print_summary(mm5, "mm5")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1952.36971 | 438.95248 | 4.447793 |
| fixed | NA | year | 25.95908 | 17.97977 | 1.443794 |
| ran_pars | oa_color_finale | sd__(Intercept) | 620.72870 | NA | NA |
| ran_pars | oa_color_finale | cor__(Intercept).year | -1.00000 | NA | NA |
| ran_pars | oa_color_finale | sd__year | 25.38926 | NA | NA |
| ran_pars | Residual | sd__Observation | 753.27830 | NA | NA |
viz_intercept_trend(mm5, "mm5", summary_mm5)

Chaque groupe d’articles selon que l’auteur correspondant soit français dispose de son ordonnée à l’origine, et de son coefficient de pente de régression impacté par l’année.
## Modèle multi-niveaux
mm6 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | is_french_CA_bso_wos_openalex_single_lang), data = bso_pop)
# Summary du modèle
print_summary(mm6, "mm6")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1648.06453 | 164.325970 | 10.029240 |
| fixed | NA | year | 38.07521 | 5.932018 | 6.418592 |
| ran_pars | is_french_CA_bso_wos_openalex_single_lang | sd__(Intercept) | 232.27695 | NA | NA |
| ran_pars | is_french_CA_bso_wos_openalex_single_lang | cor__(Intercept).year | -1.00000 | NA | NA |
| ran_pars | is_french_CA_bso_wos_openalex_single_lang | sd__year | 8.25383 | NA | NA |
| ran_pars | Residual | sd__Observation | 816.34946 | NA | NA |
viz_intercept_trend(mm6, "mm6", summary_mm6)

# On rassemble les prévisions
fitted <- bso_pop %>% select(year, amount_apc_EUR_trusted) %>% mutate(year = year + 2013)
fitted <- cbind(fitted, prediction_lm1, prediction_lm2, prediction_mm1, prediction_mm2, prediction_mm3, prediction_mm4, prediction_mm5, prediction_mm6)
# Visualisation
graph <- fitted %>% group_by(year) %>% summarise_all(sum) %>%
ggplot(aes(x = year)) +
geom_line(aes(y = amount_apc_EUR_trusted, color = "observed values")) +
geom_point(aes(y = amount_apc_EUR_trusted), col = "red", size = 1.2) +
geom_line(aes(y = prediction_lm1, color = "lm1")) +
geom_line(aes(y = prediction_lm2, color = "lm2")) +
geom_line(aes(y = prediction_mm1, color = "mm1")) +
geom_line(aes(y = prediction_mm2, color = "mm2")) +
geom_line(aes(y = prediction_mm3, color = "mm3")) +
geom_line(aes(y = prediction_mm4, color = "mm4")) +
geom_line(aes(y = prediction_mm5, color = "mm5")) +
geom_line(aes(y = prediction_mm6, color = "mm6")) +
scale_color_manual(values = c('observed values' = "red",
"lm1" = "#999999",
"lm2" = "#E69F00",
"mm1" = "#56B4E9",
"mm2" = "#009E73",
"mm3" = "#000000",
"mm4" = "#0072B2",
"mm5" = "#D55E00",
"mm6" = "#CC79A7"
)) +
labs(y = "Total cost of APCs", title = "Somme des APC estimés par les modèles, par année", colour = " ") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
theme_classic() +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(aes(y = amount_apc_EUR_trusted, label = paste(" ", format(round(amount_apc_EUR_trusted / 1e6, 1), trim = TRUE), "M€")),
color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("11.models_APC-paid_sum", graph)

# Visualisation
graph <- fitted %>% group_by(year) %>% summarise_all(.funs = c(mean="mean")) %>% mutate(year = as.numeric(as.character(year))) %>%
ggplot(aes(x = year)) +
geom_line(aes(y = amount_apc_EUR_trusted_mean, color = "observed values")) +
geom_point(aes(y = amount_apc_EUR_trusted_mean), col = "red", size = 1.2) +
geom_line(aes(y = prediction_lm1_mean, color = "lm1")) +
geom_line(aes(y = prediction_lm2_mean, color = "lm2")) +
geom_line(aes(y = prediction_mm1_mean, color = "mm1")) +
geom_line(aes(y = prediction_mm2_mean, color = "mm2")) +
geom_line(aes(y = prediction_mm3_mean, color = "mm3")) +
geom_line(aes(y = prediction_mm4_mean, color = "mm4")) +
geom_line(aes(y = prediction_mm5_mean, color = "mm5")) +
geom_line(aes(y = prediction_mm6_mean, color = "mm6")) +
scale_color_manual(values = c('observed values' = "red",
"lm1" = "#999999",
"lm2" = "#E69F00",
"mm1" = "#56B4E9",
"mm2" = "#009E73",
"mm3" = "#000000",
"mm4" = "#0072B2",
"mm5" = "#D55E00",
"mm6" = "#CC79A7"
)) +
labs(color = "", y = "Average APC", title = "Moyenne des APC estimés par les modèles, par année", colour = "") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
theme_classic() +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(aes(y = amount_apc_EUR_trusted_mean, label = paste(" ", round(amount_apc_EUR_trusted_mean, 0))), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("12.models_APC-paid_mean", graph)

Les graphiques représentant les APC cumulés et moyens montrent tous deux que la meilleure estimation (la courbe s’approchant le plus des valeurs observées en rouge) est le troisième modèle multi-niveaux, “mm3”. Il s’agit du modèle où la source de variation est le tier.
# Tableau récapitulatif
recapitulatif <- data.frame(
fix.ef = c("year",
"year + tier + oa_color_finale + is_covered_by_couperin + is_french_CA_bso_wos_openalex_single_lang + bso_classification",
"year",
"year",
"year",
"year",
"year",
"year"),
ran.ef = c("",
"",
"bso_classification",
"bso_classification",
"tier",
"is_covered_by_couperin",
"oa_color_finale",
"is_french_CA_bso_wos_openalex_single_lang"),
AIC = c(AIC(lm1), AIC(lm2), AIC(mm1), AIC(mm2), AIC(mm3), AIC(mm4), AIC(mm5), AIC(mm6)),
BIC = c(BIC(lm1), BIC(lm2), BIC(mm1), BIC(mm2), BIC(mm3), BIC(mm4), BIC(mm5), BIC(mm6)),
MAE = c(mae_lm1, mae_lm2, mae_mm1, mae_mm2, mae_mm3, mae_mm4, mae_mm5, mae_mm6),
MDAE = c(mdae_lm1, mdae_lm2, mdae_mm1, mdae_mm2, mdae_mm3, mdae_mm4, mdae_mm5, mdae_mm6),
SAE = c(sae_lm1, sae_lm2, sae_mm1, sae_mm2, sae_mm3, sae_mm4, sae_mm5, sae_mm6))
# Table finale
recapitulatif <- recapitulatif %>% mutate_if(is.numeric, round, digits = 0)
recapitulatif %>% DT::datatable(options = list(searching = F), width = '60%')
La variance peut différer d’un groupe à un autre et il convient de le vérifier statistiquement pour le prendre en compte si besoin. Le test de Bartlett ci-dessous teste l’homogénéité de la variance comme hypothèse nulle, nous l’appliquons à la variable de l’année ‘year’ et regardons son résultat :
# Test d'homogénéité de la variance (H0)
output <- bartlett.test(amount_apc_EUR_trusted ~ interaction(year), data = bso_pop) #hétérogène
# Présentation des résultats
df <- data.frame(value = unlist(output))
tdf <- as.data.frame(t(df))
tdf <- tdf %>% mutate(data.name = str_replace_all(data.name, "_", "\\_"), #escape special characters
data.name = gsub("bso_pop\\$", "", data.name))
formattable(tdf)
| statistic.Bartlett’s K-squared | parameter.df | p.value | data.name | method | |
|---|---|---|---|---|---|
| value | 1305.81315717348 | 7 | 9.19370755033575e-278 | amount_apc_EUR_trusted by interaction(year) | Bartlett test of homogeneity of variances |
La p-value étant très proche de zéro nous rejetons l’hypothèse nulle : la variance est hétérogène. Dans les modèles suivants nous prenons en compte cette hétérogénéité de la variance et les comparons aux modélisations précédentes à l’aide d’une analyse de variance ANOVA.
Nous construisons un modèle en ajoutant des poids (weights) pouvant varier d’une année à l’autre, pour un effet aléatoire par tier d’éditeurs (modèle le plus performant estimé précédemment).
## Modèle multi-niveaux
library(nlme)
hmm1 <- lme(amount_apc_EUR_trusted ~ year,
data = bso_pop,
random = ~ 1 + year | tier,
weights = varIdent(form = ~ 1 | year),
control =list(msMaxIter = 1000, msMaxEval = 1000))
# Summary du modèle
print_summary(hmm1, "hmm1")
| effect | group | term | estimate | std.error | df | statistic | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1813.747058 | 234.56788 | 119611 | 7.732291 | 0.0000000 |
| fixed | NA | year | 24.371379 | 19.38816 | 119611 | 1.257024 | 0.2087474 |
| ran_pars | tier | sd_(Intercept) | 469.015194 | NA | NA | NA | NA |
| ran_pars | tier | cor_year.(Intercept) | -0.856127 | NA | NA | NA | NA |
| ran_pars | tier | sd_year | 38.710926 | NA | NA | NA | NA |
| ran_pars | Residual | sd_Observation | 624.928686 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 0 | 1.000000 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 1 | 1.080814 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 2 | 1.292266 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 3 | 1.263160 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 4 | 1.333781 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 5 | 1.376769 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 6 | 1.324006 | NA | NA | NA | NA |
| var_model | varIdent(1 | year) | 7 | 1.240774 | NA | NA | NA | NA |
# ANOVA mm3 et hmm1
anova <- anova(mm3, hmm1)
# Présentation des résultats
df <- data.frame(value = unlist(anova))
tdf <- as.data.frame(t(df))
formattable(tdf)
| npar | Sum Sq | Mean Sq | F value | |
|---|---|---|---|---|
| value | 1 | 523160.2 | 523160.2 | 0.8337304 |
#L'analyse de variance sous hypothèse nulle d'égalité de moyennes, montre qu'il n'y a pas de différence significative entre les deux modèles (p-value = ``r round(tdf$`Pr(>F)`, 2)``). Nous conservons donc le modèle initial présenté en partie précédente.
# Préparation du jeu de données prêt à modéliser
bso_frenchCA <- bso_mvp %>%
filter(!is.na(bso_classification),
!is.na(oa_color_finale),
!is.na(is_french_CA_bso_wos_openalex_single_lang),
!is.na(amount_apc_EUR),
!is.na(is_covered_by_couperin)) %>%
filter(is_french_CA_bso_wos_openalex_single_lang == "french CA", apc_has_been_paid == 1, amount_apc_EUR > 0)
Par rapport à la partie précédente, ici ne sont gardés que les articles ayant un auteur correspondant français. Il y a ainsi 81054 articles avec un CA français, pour lesquels on sait qu’un APC a été payé, et dont la discipline, le tier, la couleur et la couverture par Couperin sont connus (seuls 4 articles avec un CA français et des APC payés ont été mis de côté parce que l’une des variables citées étaient inconnues).
#detach(package:plyr)
table2_y1_class <- bso_frenchCA %>% group_by(year, bso_classification) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup()
# Plot
graph <- ggplot(table2_y1_class, aes(x = year, y = montant_moyen, group = bso_classification, colour = bso_classification)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
labs(y = "Average APC", title = "Montants moyens des APC payés par discipline et par année", colour = "") +
scale_color_manual(values = c("#025bbb", "#9d6939", "#3aaf29", "#fb6d35", "#fcb2b8", "#f1e401", "#284803", "#852895", "#24c0e8", "#919598"),
breaks = c("Biology (fond.)", "Medical research", "Chemistry", "Computer and \n information sciences", "Social sciences", "Earth, Ecology, \nEnergy and applied biology", "Physical sciences, Astronomy", "Engineering", "Humanities", "Mathematics")) +
theme_classic() +
theme(legend.position = "right",
axis.title.x = element_blank())
graph
saving_plot("13.explo_french-CA_discipline", graph)

table2_y1_tier <- bso_frenchCA %>% group_by(year, tier) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup() %>%
mutate(tier = as.character(paste("tier", tier, sep = " ")))
# Plot
graph <- ggplot(table2_y1_tier, aes(x = year, y = montant_moyen, group = tier, colour = tier)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(y = "Average APC", title = "Montants moyens des APC payés par tier et par année", color = "") +
theme_classic() +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099", "#0072B2", "#D55E00")) +
theme(legend.position = "right",
axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table2_y1_tier %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("14.explo_french-CA_tier", graph)

table2_y1_color <- bso_frenchCA %>% group_by(year, oa_color_finale) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup() %>% mutate(oa_color_finale = as.character(oa_color_finale))
# Plot
graph <- ggplot(table2_y1_color, aes(x = year, y = montant_moyen, group = oa_color_finale, colour = oa_color_finale)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par couleur OA et par année") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#0072B2", "#FFCC00"),
breaks = c("hybridOA", "gold")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table2_y1_color %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("15.explo_french-CA_OAcolor", graph)

table2_y1_coup <- bso_frenchCA %>% group_by(year, is_covered_by_couperin) %>%
summarise(montant_moyen = mean(amount_apc_EUR)) %>% ungroup() %>%
mutate(is_covered_by_couperin = case_when(is_covered_by_couperin == "covered" ~ "covered by couperin",
is_covered_by_couperin == "non covered" ~ "not covered by couperin"))
# Plot
graph <- ggplot(table2_y1_coup, aes(x = year, y = montant_moyen, group = is_covered_by_couperin, colour = is_covered_by_couperin)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par année, selon si l'article est couvert par les accords Couperin") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099"),
breaks = c("covered by couperin", "not covered by couperin")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table2_y1_coup %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("16.explo_french-CA_covered", graph)

table2_y2_class <- bso_frenchCA %>% group_by(year, bso_classification) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup()
# Plot
graph <- ggplot(table2_y2_class, aes(x = year, y = montant_moyen, group = bso_classification, colour = bso_classification)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
labs(y = "Average APC", title = "Montants moyens des APC payés par discipline et par année", colour = "") +
scale_color_manual(values = c("#025bbb", "#9d6939", "#3aaf29", "#f1e401", "#284803", "#fcb2b8", "#fb6d35", "#852895", "#24c0e8", "#919598"),
breaks = c("Biology (fond.)", "Medical research", "Chemistry", "Earth, Ecology, \nEnergy and applied biology", "Physical sciences, Astronomy", "Social sciences", "Computer and \n information sciences", "Engineering", "Humanities", "Mathematics")) +
theme_classic() +
theme(legend.position = "right",
axis.title.x = element_blank())
graph
saving_plot("17.explo_french-CA_discipline_trusted", graph)

table2_y2_tier <- bso_frenchCA %>% group_by(year, tier) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup() %>%
mutate(tier = as.character(paste("tier", tier, sep = " ")))
# Plot
graph <- ggplot(table2_y2_tier, aes(x = year, y = montant_moyen, group = tier, colour = tier)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(y = "Average APC", title = "Montants moyens des APC payés par tier et par année", color = "") +
theme_classic() +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099", "#0072B2", "#D55E00")) +
theme(legend.position = "right",
axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table2_y2_tier %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("18.explo_french-CA_tier_trusted", graph)

table2_y2_color <- bso_frenchCA %>% group_by(year, oa_color_finale) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup() %>% mutate(oa_color_finale = as.character(oa_color_finale))
# Plot
graph <- ggplot(table2_y2_color, aes(x = year, y = montant_moyen, group = oa_color_finale, colour = oa_color_finale)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par couleur OA et par année") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#0072B2", "#FFCC00"),
breaks = c("hybridOA", "gold")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table2_y2_color %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("19.explo_french-CA_OAcolor_trusted", graph)

table2_y2_coup <- bso_frenchCA %>% group_by(year, is_covered_by_couperin) %>%
summarise(montant_moyen = mean(amount_apc_EUR_trusted, na.rm = T)) %>% ungroup() %>%
mutate(is_covered_by_couperin = case_when(is_covered_by_couperin == "covered" ~ "covered by couperin",
is_covered_by_couperin == "non covered" ~ "not covered by couperin"))
# Plot
graph <- ggplot(table2_y2_coup, aes(x = year, y = montant_moyen, group = is_covered_by_couperin, colour = is_covered_by_couperin)) +
geom_line(size=1, alpha=0.9, linetype=1) +
geom_point(colour="white", size = .6, pch = 21, stroke = 1.5) +
labs(color = "", y = "Average APC", title = "Montants moyens des APC payés par année, selon si l'article est couvert par les accords Couperin") +
theme_classic() +
scale_x_continuous(breaks = seq(2013, 2020, 1)) +
scale_color_manual(values = c("#009E73", "#660099"),
breaks = c("covered by couperin", "not covered by couperin")) +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(data = table2_y2_coup %>% filter(year %% 2 == 0),
aes(label = round(montant_moyen, 0)), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("20.explo_french-CA_covered_trusted", graph)

Ici encore en prenant comme population seulement les articles avec le CA français, nous décidons de modéliser le montant des APC fiable.
n_old <- nrow(bso_frenchCA)
bso_frenchCA <- bso_frenchCA %>% select(-amount_apc_EUR) %>% filter(!is.na(amount_apc_EUR_trusted))
Le nombre d’articles pour les modélisations est ainsi de 65785. En ne gardant que les sources d’APC fiables, 15269 articles ont été écartés, soit 18.84% des articles avec des APC payés, un CA français et des valeurs connues pour les autres variables.
# Test de corrélation (Spearman car distribution non normale)
output <- cor.test(bso_frenchCA$amount_apc_EUR_trusted, bso_frenchCA$year, method = "spearman") # corrélation significative
df <- data.frame(value = unlist(output))
tdf <- as.data.frame(t(df))
tdf <- tdf %>% mutate(data.name = str_replace_all(data.name, "_", "\\_"), #escape special characters
data.name = gsub("bso_frenchCA\\$", "", data.name))
formattable(tdf)
| statistic.S | p.value | estimate.rho | null.value.rho | alternative | method | data.name | |
|---|---|---|---|---|---|---|---|
| value | 39956541196575.7 | 0 | 0.15791001897175 | 0 | two.sided | Spearman’s rank correlation rho | amount_apc_EUR_trusted and year |
# Tests de dépendance
# Année
save_results(chisq.test(bso_frenchCA$year, bso_frenchCA$bso_classification), "1")
save_results(chisq.test(bso_frenchCA$year, bso_frenchCA$tier), "2")
save_results(chisq.test(bso_frenchCA$year, bso_frenchCA$oa_color_finale), "3")
save_results(chisq.test(bso_frenchCA$year, bso_frenchCA$is_covered_by_couperin), "4")
# Discipline
save_results(chisq.test(bso_frenchCA$bso_classification, bso_frenchCA$tier), "5")
save_results(chisq.test(bso_frenchCA$bso_classification, bso_frenchCA$oa_color_finale), "6")
save_results(chisq.test(bso_frenchCA$bso_classification, bso_frenchCA$is_covered_by_couperin), "7")
# Tier
save_results(chisq.test(bso_frenchCA$tier, bso_frenchCA$oa_color_finale), "8")
save_results(chisq.test(bso_frenchCA$tier, bso_frenchCA$is_covered_by_couperin), "9")
# Couleur OA
save_results(chisq.test(bso_frenchCA$oa_color_finale, bso_frenchCA$is_covered_by_couperin), "10")
# Présentation des résultats dans une table
output <- rbind(tdf_1,tdf_2,tdf_3,tdf_4,tdf_5,tdf_6,tdf_7,tdf_8,tdf_9,tdf_10) %>%
mutate(p.value = as.numeric(p.value),
Dependance = case_when(p.value <= 0.05 ~ "Oui",
TRUE ~ "Non"))
# Escape special characters
output <- output %>% mutate(data.name = str_replace_all(data.name, "_", "\\_"), #escape special characters
data.name = gsub("bso_frenchCA\\$", "", data.name)) #remove base name
rownames(output) <- NULL #remove rownames
# Display table
kable(output, format = "html", caption = "Résultats des tests de dépendance entre les variables qualitatives") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F,
html_font = "sans-serif")
| statistic.X-squared | parameter.df | p.value | method | data.name | Dependance |
|---|---|---|---|---|---|
| 1179.36582494436 | 63 | 0 | Pearson’s Chi-squared test | year and bso_classification | Oui |
| 6674.06953803312 | 21 | 0 | Pearson’s Chi-squared test | year and tier | Oui |
| 49.3016149041887 | 7 | 0 | Pearson’s Chi-squared test | year and oa_color_finale | Oui |
| 74.7630393039464 | 7 | 0 | Pearson’s Chi-squared test | year and is_covered_by_couperin | Oui |
| 4064.54084270166 | 27 | 0 | Pearson’s Chi-squared test | bso_classification and tier | Oui |
| 2327.25470251979 | 9 | 0 | Pearson’s Chi-squared test | bso_classification and oa_color_finale | Oui |
| 7596.56394938853 | 9 | 0 | Pearson’s Chi-squared test | bso_classification and is_covered_by_couperin | Oui |
| 5757.9702867729 | 3 | 0 | Pearson’s Chi-squared test | tier and oa_color_finale | Oui |
| 7564.31853293548 | 3 | 0 | Pearson’s Chi-squared test | tier and is_covered_by_couperin | Oui |
| 21632.3827056521 | 1 | 0 | Pearson’s Chi-squared test with Yates’ continuity correction | oa_color_finale and is_covered_by_couperin | Oui |
Ici encore il existe une forte dépendance entre les variables explicatives du modèle, à ne pas inclure dans une même modélisation donc.
# Normalité de Y
mean <- mean(bso_frenchCA$amount_apc_EUR_trusted, na.rm = TRUE)
sd <- sd(bso_frenchCA$amount_apc_EUR_trusted, na.rm = TRUE)
ggplot(bso_frenchCA, aes(bso_frenchCA$amount_apc_EUR_trusted)) +
geom_histogram(aes(y=..density..), color="black", fill = "steelblue", alpha = 0.2) +
geom_density( color="red", size = 1, adjust = 5) +
stat_function(fun = dnorm, colour = "Black", size = 1, args = list(mean = mean, sd = sd)) +
xlim(c(min(bso_frenchCA$amount_apc_EUR_trusted)-1, max(bso_frenchCA$amount_apc_EUR_trusted)+1))+ ggtitle("Distribution du montant des APC fiables") +
xlab("Montant des APC fiables") + ylab("Densité") +
theme_classic()
# Test statistique de normalité
out <- JarqueBeraTest(bso_frenchCA$amount_apc_EUR_trusted, robust = FALSE, method = "chisq") # Y non normal (on rejette H0)
library(formattable)
df <- data.frame(value = unlist(out))
tdf <- as.data.frame(t(df))
formattable(tdf)
| statistic.X-squared | parameter.df | p.value | method | data.name | |
|---|---|---|---|---|---|
| value | 36052.1176412677 | 2 | 0 | Jarque Bera Test | bso_frenchCA$amount_apc_EUR_trusted |

Le montant des APC fiables n’est pas normalement distribué pour les articles ayant un CA français et où des APC ont été payés.
Chaque discipline dispose de son ordonnée à l’origine, et l’année impacte l’angle de la pente de régression par discipline.
## Modèle multi-niveaux
bso_frenchCA <- bso_frenchCA %>% mutate(year = year - 2013)
mm2.1 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | bso_classification), data = bso_frenchCA)
# Summary du modèle
print_summary(mm2.1, "mm2.1")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1372.2979161 | 90.66522 | 15.135880 |
| fixed | NA | year | 14.0251387 | 11.13582 | 1.259461 |
| ran_pars | bso_classification | sd__(Intercept) | 281.5986411 | NA | NA |
| ran_pars | bso_classification | cor__(Intercept).year | 0.0652288 | NA | NA |
| ran_pars | bso_classification | sd__year | 33.4758250 | NA | NA |
| ran_pars | Residual | sd__Observation | 764.0597813 | NA | NA |
viz_intercept_trend(mm2.1, "mm2.1", summary_mm2.1)

Chaque tier dispose de son ordonnée à l’origine, et l’année impacte l’angle de la pente de régression par tier.
## Modèle multi-niveaux
mm2.2 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | tier), data = bso_frenchCA)
# Summary du modèle
print_summary(mm2.2, "mm2.2")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1732.9556270 | 212.67692 | 8.148301 |
| fixed | NA | year | 26.8037745 | 20.59414 | 1.301524 |
| ran_pars | tier | sd__(Intercept) | 425.0766712 | NA | NA |
| ran_pars | tier | cor__(Intercept).year | -0.9076197 | NA | NA |
| ran_pars | tier | sd__year | 41.0629758 | NA | NA |
| ran_pars | Residual | sd__Observation | 763.5601474 | NA | NA |
viz_intercept_trend(mm2.2, "mm2.2", summary_mm2.2)

Chaque groupe d’articles selon qu’ils soient couvert par les accords Couperin dispose de son ordonnée à l’origine, et de son coefficient de pente de régression impacté par l’année.
## Modèle multi-niveaux
mm2.3 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | is_covered_by_couperin), data = bso_frenchCA)
# Summary du modèle
print_summary(mm2.3, "mm2.3")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1812.33453 | 360.19889 | 5.031483 |
| fixed | NA | year | 30.25574 | 14.24999 | 2.123210 |
| ran_pars | is_covered_by_couperin | sd__(Intercept) | 509.27854 | NA | NA |
| ran_pars | is_covered_by_couperin | cor__(Intercept).year | -1.00000 | NA | NA |
| ran_pars | is_covered_by_couperin | sd__year | 20.06310 | NA | NA |
| ran_pars | Residual | sd__Observation | 764.77417 | NA | NA |
viz_intercept_trend(mm2.3, "mm2.3", summary_mm2.3)

Chaque couleur OA dispose de son ordonnée à l’origine, et l’année impacte l’angle de la pente de régression par couleur OA.
## Modèle multi-niveaux
mm2.4 <- lmer(amount_apc_EUR_trusted ~ year + (1 + year | oa_color_finale), data = bso_frenchCA)
# Summary du modèle
print_summary(mm2.4, "mm2.4")
| effect | group | term | estimate | std.error | statistic |
|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 1929.00036 | 398.59206 | 4.8395353 |
| fixed | NA | year | 17.50577 | 24.02843 | 0.7285438 |
| ran_pars | oa_color_finale | sd__(Intercept) | 563.59894 | NA | NA |
| ran_pars | oa_color_finale | cor__(Intercept).year | -1.00000 | NA | NA |
| ran_pars | oa_color_finale | sd__year | 33.93122 | NA | NA |
| ran_pars | Residual | sd__Observation | 737.07446 | NA | NA |
viz_intercept_trend(mm2.4, "mm2.4", summary_mm2.4)

On fait un simple modèle linéaire.
## Modèle multi-niveaux
lm2.1 <- lm(amount_apc_EUR_trusted ~ year, data = bso_frenchCA)
# Summary du modèle
print_summary(lm2.1, "lm2.1")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1561.70792 | 6.506397 | 240.02654 | 0 |
| year | 41.10933 | 1.357211 | 30.28958 | 0 |
# On rassemble les prévisions
fitted2 <- bso_frenchCA %>% select(year, amount_apc_EUR_trusted) %>% mutate(year = year + 2013)
fitted2 <- cbind(fitted2, prediction_mm2.1, prediction_mm2.2, prediction_mm2.3, prediction_mm2.4)
# Visualisation
graph <- fitted2 %>% group_by(year) %>% summarise_all(sum) %>%
ggplot(aes(x = year)) +
geom_line(aes(y = amount_apc_EUR_trusted, color = "observed values")) +
geom_point(aes(y = amount_apc_EUR_trusted), col = "red", size = 1.2) +
geom_line(aes(y = prediction_mm2.1, color = "discipline")) +
geom_line(aes(y = prediction_mm2.2, color = "tier")) +
geom_line(aes(y = prediction_mm2.3, color = "covered by Couperin")) +
geom_line(aes(y = prediction_mm2.4, color = "OA color")) +
scale_color_manual(values = c('observed values' = "red",
"discipline" = "#009E73",
"tier" = "#000000",
"covered by Couperin" = "#0072B2",
"OA color" = "#FFCC00"
)) +
labs(color = "", y = "Total cost of APCs", title = "Somme des APC estimés par les modèles, par année
(Articles avec un auteur correspondant français)", colour = "Modélisations") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
theme_classic() +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(aes(y = amount_apc_EUR_trusted, label = paste(" ", format(round(amount_apc_EUR_trusted / 1e6, 1), trim = TRUE), "M€")),
color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("21.models_frenchCA_sum", graph)

# Visualisation
graph <- fitted2 %>% group_by(year) %>% summarise_all(.funs = c(mean="mean")) %>% mutate(year = as.numeric(as.character(year))) %>%
ggplot(aes(x = year)) +
geom_line(aes(y = amount_apc_EUR_trusted_mean, color = "observed values")) +
geom_point(aes(y = amount_apc_EUR_trusted_mean), col = "red", size = 1.2) +
geom_line(aes(y = prediction_mm2.1_mean, color = "discipline")) +
geom_line(aes(y = prediction_mm2.2_mean, color = "tier")) +
geom_line(aes(y = prediction_mm2.3_mean, color = "covered by Couperin")) +
geom_line(aes(y = prediction_mm2.4_mean, color = "OA color")) +
scale_color_manual(values = c('observed values' = "red",
"discipline" = "#009E73",
"tier" = "#000000",
"covered by Couperin" = "#0072B2",
"OA color" = "#FFCC00"
)) +
labs(color = "", y = "Average APC", title = "Moyenne des APC estimés par les modèles, par année
(Articles avec un auteur correspondant français)", colour = "Modélisations") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks=seq(2013, 2020, 1)) +
theme_classic() +
theme(legend.position = "right", axis.title.x = element_blank()) +
ggrepel::geom_text_repel(aes(y = amount_apc_EUR_trusted_mean, label = paste(" ", round(amount_apc_EUR_trusted_mean, 0))), color = '#333333', hjust=-.25, nudge_y = 5, nudge_x = -.1, size=3.5)
graph
saving_plot("22.models_frenchCA_mean", graph)

Visuellement parlant, la meilleure estimation semble être celle réalisée à partir du deuxième modèle où la source de variation est le tier.
error_year <- fitted2 %>% group_by(year) %>% summarise_all(sum) %>%
mutate(error_discipline = round(abs(prediction_mm2.1 - amount_apc_EUR_trusted), 0),
error_tier = round(abs(prediction_mm2.2 - amount_apc_EUR_trusted), 0),
error_covered_couperin = round(abs(prediction_mm2.3 - amount_apc_EUR_trusted), 0),
error_OAcolor = round(abs(prediction_mm2.4 - amount_apc_EUR_trusted), 0)) %>%
select(1, 7:10) %>% mutate(year = as.character(year)) %>%
bind_rows(summarise_all(., ~if(is.numeric(.)) sum(.) else "Somme des erreurs")) %>%
bind_rows(summarise_all(., ~if(is.numeric(.)) mean(.) else "Moyenne des erreurs")) %>%
mutate_if(is.numeric, round, digits = 0)
error_year %>% DT::datatable(options = list(searching = F), width = '60%')
Ici nous avons calculé le montant total d’APC payés par année selon le BSO (valeurs réelles) et selon les 4 modèles estimés. Nous avons ensuite soustrait les montants réels aux montants prédits par les modèles (en gardant une valeur absolue), de manière à obtenir une erreur de prévision par année. Enfin, nous avons calculé la somme et la moyenne de ces erreurs, ce qui nous permet de constater que le meilleur modèle est celui où le tier est la source de variation / l’effet aléatoire. C’est donc à partir de ce modèle que nous estimons le montant total d’APC pour les années 2021-2030, tendances inchangées.
# Tableau récapitulatif
recapitulatif2 <- data.frame(
fix.ef = c("year",
"year",
"year",
"year"),
ran.ef = c("bso_classification",
"tier",
"is_covered_by_couperin",
"oa_color_finale"),
AIC = c(AIC(mm2.1), AIC(mm2.2), AIC(mm2.3), AIC(mm2.4)),
BIC = c(BIC(mm2.1), BIC(mm2.2), BIC(mm2.3), BIC(mm2.4)),
MAE = c(mae_mm2.1, mae_mm2.2, mae_mm2.3, mae_mm2.4),
MDAE = c(mdae_mm2.1, mdae_mm2.2, mdae_mm2.3, mdae_mm2.4),
SAE = c(sae_mm2.1, sae_mm2.2, sae_mm2.3, sae_mm2.4))
# Table finale
# on arrondit les mesures de qualité
recapitulatif2 <- recapitulatif2 %>% mutate_if(is.numeric, round, digits = 0)
# on affiche le tableau
recapitulatif2 %>% DT::datatable(options = list(searching = F), width = '60%')
On trouve dans cette table récapitulative différentes mesures de la qualité des modèles. Les critères AIC, BIC ainsi que les mesures d’erreurs MAE (mean absolute error), MDAE (median absolute error) et SAE (sum absolute error) doivent être minimisés. Le classement des modèles au vu de ces statistiques n’est pas le même d’une mesure à une autre, ainsi :
#devtools::install_github(c("ramnathv/htmlwidgets", "smartinsightsfromdata/rpivotTable"))
library(rpivotTable)
bso_pop <- bso_pop %>% mutate(year = year + 2013)
rpivotTable(bso_pop, rows = "year", cols = c("tier"), width = "100%", height = "400px")